This report analyzes proteomics data. The analysis includes:
Data loading and quality control
Protein and peptide quantification
Statistical analysis
Visualization of results
library(ggplot2)
library(dplyr)
custom_theme <- function() {
theme_bw(base_size = 10) +
theme(
# Vertical x-axis labels (applies to ALL plots)
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
# Unified text sizes
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
plot.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 11),
legend.title = element_text(size = 12),
# Other consistent styling
legend.position = "bottom",
panel.grid.major.y = element_line(color = "gray90")
)
}
# Make this theme apply to all plots automatically
theme_set(custom_theme())
library(tidyverse)
library(ggrepel)
library(DEP)
library(missForest)
library(pheatmap)
library(openxlsx)
library(RColorBrewer)
library(ggpubr)
library(dichromat)
library(factoextra)
library(ggplotify)
library(OpenImageR)
library(svDialogs)
library(raster)
library(viridis)
library(patchwork)
library(here)
if (!requireNamespace("preprocessCore", quietly = TRUE)) {
install.packages("preprocessCore")
}
library(preprocessCore)
# Set working directory to script location
sampleTypeDir <- file.path(dirname(rstudioapi::getSourceEditorContext()$path))
setwd(sampleTypeDir)
default_dir <- paste0(sampleTypeDir, "/*.*")
# Load custom functions
source("functions_ver2.3.R")
# Select input files interactively
protein_file_name <- choose.files(
default = default_dir,
caption = "Please, select proteins file",
multi = FALSE
)
file_loc <- stringr::str_replace(protein_file_name, pattern = "_Proteins.txt", replacement = "")
sample_list_file_name <- choose.files(
default = default_dir,
caption = "Please, select sample list file",
multi = FALSE
)
# Define peptides file
peptides_file_name <- paste0(file_loc, "_PeptideGroups.txt")
projectName <- t(data.frame(strsplit(protein_file_name, split = "\\\\")))
projectName <- projectName[,ncol(projectName)]
projectName <- stringr::str_replace(projectName, "_Proteins.txt", replacement = "")
# Load protein data
proteinGroups <- read.delim(protein_file_name, check.names = FALSE)
peptides <- read.delim(peptides_file_name, check.names = FALSE)
# Load sample list
sample_list <- read.delim(sample_list_file_name, check.names = FALSE)
group_levels <- unique(sample_list$Group)
# If more than 2 groups, define which to use
if (length(group_levels) != 2) {
# Specify the two groups you want to analyze
selected_groups <- c("Prototype", "MPSP")
# Check they exist in the data
if (!all(selected_groups %in% group_levels)) {
stop("Selected groups not found in the sample list. Available groups: ", paste(group_levels, collapse = ", "))
}
# Filter to only selected groups
sample_list <- sample_list %>% filter(Group %in% selected_groups)
group_levels <- selected_groups
}
# Assign groups
GroupA <- group_levels[1]
GroupB <- group_levels[2]
ratio_name <- paste0(GroupA, "/", GroupB)
# Final filtering to ensure only the two groups are used
sample_list <- sample_list %>% filter(Group %in% c(GroupA, GroupB))
## Automatically detect group order
ordered_groups <- unique(sample_list$Group)
# Automatically detect the 2 group names from sample_list
ordered_groups <- sort(unique(sample_list$Group)) # Alphabetical or custom logic
# Color Palette
group_names <- unique(sample_list$Group)
#extended_palette <- brewer.pal(n = max(3, length(group_names)), name = "Paired")
extendet_pallete <- colorRampPalette(brewer.pal(12, name = "Paired"))(length(group_names))
group_colors <- setNames(extendet_pallete[1:length(group_names)], group_names)
no_peptides <- 2
is_master_protein <- TRUE
### Prepare Protein Data
# Select columns for protein file
list_of_columns_proteins <- c(
"Number",
"Accession",
"Description",
"Master",
"Coverage [%]",
"# Peptides",
"# PSMs",
"# Unique Peptides",
"MW [kDa]",
"Contaminant",
"# Protein Groups",
"Sequence"
)
# Verify all requested columns exist
missing_cols <- setdiff(list_of_columns_proteins, names(proteinGroups))
if(length(missing_cols) > 0) {
warning(paste("The following columns are missing:", paste(missing_cols, collapse=", ")))
}
# Select the identification columns
proteinGroups_identification <- proteinGroups %>%
dplyr::select(any_of(list_of_columns_proteins), starts_with("Found in Sample:"))
# Select abundance columns - note your actual columns start with "Abundance:" not "Abundances:"
proteinGroups_abundances <- proteinGroups %>%
dplyr::select(starts_with("Abundance:"))
# Handle empty values
proteinGroups_abundances[proteinGroups_abundances == ''] <- NA
# Select only samples from sample list
proteinGroups_abundances <- proteinGroups_abundances %>%
dplyr::select(all_of(sample_list$RawFile))
# Update sample names
new_sample_names <- sample_list$Sample
colnames(proteinGroups_abundances) <- new_sample_names
# Merge identification and abundance data
proteinGroups_merged <- cbind(proteinGroups_identification, proteinGroups_abundances)
# Apply filtering criteria
filtered <- proteinGroups_merged
if(is_master_protein == TRUE){
filtered1 <- filtered[filtered$Master == "IsMasterProtein",]
} else {
filtered1 <- filtered
}
filtered2 <- filtered1[filtered1$`# Peptides` >= no_peptides,]
filtered3 <- filtered2[filtered2$Contaminant != "True",]
proteinGroups_filtered <- filtered3
# Filter Peptides
peptides_filtered <- peptides %>%
dplyr::select(Sequence, starts_with("Abundance:"), "XCorr (by Search Engine): Sequest HT") %>%
filter(`XCorr (by Search Engine): Sequest HT` >= 1.5) %>%
dplyr::select(Sequence, starts_with("Abundance:"))
# Rename columns to match sample names
colnames(peptides_filtered) <- c("Sequence", sample_list$Sample)
# Prepare extended color palette
extendet_pallete <- colorRampPalette(brewer.pal(8, name = "Dark2"))(
length(unique(sample_list$Group))
)
# Protein quantification
proteinGroups_filtered_quality <- proteinGroups_filtered %>%
dplyr::select(all_of(sample_list$Sample))
quantified_proteins <- data.frame(colSums(!is.na(proteinGroups_filtered_quality)))
quantified_proteins <- rownames_to_column(quantified_proteins)
colnames(quantified_proteins) <- c("Sample", "Quantified_Proteins")
quantified_proteins$Group <- sample_list$Group
## Number of Quantified Proteins
quantified_proteins$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
if(nrow(sample_list) > 20){
plot_quant_proteins <- ggplot(quantified_proteins, aes(x = Sample, y = Quantified_Proteins))+
geom_point(aes(color = Group), size = 2)+
geom_segment(aes(x = Sample, xend = Sample, y = 0, yend = Quantified_Proptides, color = Group))+
theme_classic()+
labs(y = "Number of filtered and quantified proteins", x = NULL)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
scale_color_manual(values = group_colors)+
geom_text(aes(label=Quantified_Proteins), vjust= -1.0, hjust = 0.5, angle = 90,
color="black", size=4)
} else {
plot_quant_proteins <- ggplot(quantified_proteins,
aes(x = Sample, y = Quantified_Proteins, fill = Group))+
geom_bar(stat = "identity", position = "dodge")+
geom_text(aes(label=Quantified_Proteins), vjust=0.5, hjust = 1.5, angle = 90,
color="white", size=4)+
theme_classic()+
labs(y = "Number of filtered and quantified proteins", x = NULL)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
scale_fill_manual(values = group_colors)
}
plot_quant_proteins
# Save both PNG and SVG
ggsave(
filename = "Number of filtered and quantified proteins.png",
plot = plot_quant_proteins,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Number of filtered and quantified proteins.svg",
plot = plot_quant_proteins,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
peptides_filtered_quality <- peptides_filtered %>%
dplyr::select(all_of(sample_list$Sample))
quantified_peptides <- data.frame(colSums(!is.na(peptides_filtered_quality)))
quantified_peptides <- rownames_to_column(quantified_peptides)
colnames(quantified_peptides) <- c("Sample", "quantified_peptides")
quantified_peptides$Group <- sample_list$Group
quantified_peptides$Group <- factor(quantified_peptides$Group, levels = unique(quantified_peptides$Group))
# Ensure group ordering
quantified_peptides$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
plot_quant_peptides <- ggplot(quantified_peptides, aes(x = Sample, y = quantified_peptides, fill = Group))+
geom_bar(stat = "identity", position = "dodge")+
geom_text(aes(label=quantified_peptides), vjust=0.5, hjust = 1.5, angle = 90, color="white",
position = position_dodge(0.9), size=4)+
theme_classic()+
labs(y = "Number of filtered and quantified peptides", x = NULL)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
scale_fill_manual(values = group_colors)
plot_quant_peptides
# Save both PNG and SVG
ggsave(
filename = "Number of filtered and quantified peptides.png",
plot = plot_quant_peptides,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Number of filtered and quantified peptides.svg",
plot = plot_quant_peptides,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Create export folder if it doesn’t exist
export_dir <- "exported_images"
if (!dir.exists(export_dir)) {
dir.create(export_dir)
}
# Peptides
quantified_peptides$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
quant_peptides_average <- quantified_peptides %>%
dplyr::group_by(Group) %>%
dplyr::summarise(Mean = mean(quantified_peptides), SD = sd(quantified_peptides))
quant_peptides_average$Group <- factor(quant_peptides_average$Group, levels = unique(quant_peptides_average$Group))
p_peptides <- ggplot(quant_peptides_average, aes(x=Group, y=Mean, fill=Group)) +
geom_bar(stat="identity", color="black", position=position_dodge()) +
geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=.2, position=position_dodge(.9)) +
labs(y = "Average number of quantified peptides") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))+
scale_fill_manual(values = group_colors)
p_peptides
# Save both PNG and SVG
ggsave(
filename = "Average number of quantified peptides.png",
plot = p_peptides,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Average number of quantified peptides.svg",
plot = p_peptides,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
#Proteins
quantified_proteins$Group <- factor(quantified_proteins$Group, levels = ordered_groups)
quant_proteins_average <- quantified_proteins %>%
dplyr::group_by(Group) %>%
dplyr::summarise(Mean = mean(Quantified_Proteins),
SD = sd(Quantified_Proteins))
quant_proteins_average$Group <- factor(quant_proteins_average$Group,
levels = unique(quant_proteins_average$Group))
p_proteins <- ggplot(quant_proteins_average, aes(x=Group, y=Mean, fill=Group)) +
geom_bar(stat="identity", color="black", position=position_dodge(), width=0.7) +
geom_bar(stat="identity", color="black", position=position_dodge()) +
geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD), width=.2, position=position_dodge(.9)) +
labs(y = "Average number of quantified proteins") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))+
scale_fill_manual(values = group_colors)
p_proteins
# Save both PNG and SVG
ggsave(
filename = "Average number of quantified proteins.png",
plot = p_proteins,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Average number of quantified proteins.svg",
plot = p_proteins,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Create Number column if it doesn't exist in BOTH data frames
if(!"Number" %in% names(proteinGroups_merged)) {
proteinGroups_merged$Number <- seq_len(nrow(proteinGroups_merged))
}
if(!"Number" %in% names(proteinGroups_filtered)) {
proteinGroups_filtered$Number <- seq_len(nrow(proteinGroups_filtered))
}
# Define column order
list_of_columns_proteins_order <- c(
"Number", "Accession", "Description", "MW [kDa]",
colnames(proteinGroups_abundances),
"# Peptides", "# PSMs", "# Unique Peptides", "Coverage [%]",
"Contaminant", "# Protein Groups", "Master", "Sequence"
)
# Verify columns exist in BOTH data frames
existing_cols_merged <- intersect(list_of_columns_proteins_order, names(proteinGroups_merged))
existing_cols_filtered <- intersect(list_of_columns_proteins_order, names(proteinGroups_filtered))
# Get union of columns that exist in either data frame
final_cols <- union(existing_cols_merged, existing_cols_filtered)
# Apply column selection
proteinGroups_merged <- proteinGroups_merged %>%
dplyr::select(all_of(final_cols))
proteinGroups_filtered <- proteinGroups_filtered %>%
dplyr::select(all_of(final_cols))
# Warning about any missing columns
missing_cols <- setdiff(list_of_columns_proteins_order, final_cols)
if(length(missing_cols) > 0) {
warning(paste("The following columns are missing from one or both data frames:",
paste(missing_cols, collapse=", ")))
}
# Prepare abundance data
qc_abundance <- proteinGroups_abundances %>%
pivot_longer(cols = everything(), names_to = "Sample", values_to = "Abundance") %>%
dplyr::mutate(
Abundance = na_if(Abundance, 0),
Abundance_log = log2(Abundance)
)
# Extract group from sample name
qc_abundance <- qc_abundance %>%
left_join(sample_list[, c("Sample", "Group")], by = "Sample") %>%
mutate(Group = factor(Group, levels = ordered_groups))
# Median intensity for reference line
overall_median <- median(qc_abundance$Abundance_log, na.rm = TRUE)
# QC violin per sample
qc_abundance_long <- proteinGroups_abundances %>%
pivot_longer(cols = everything(), names_to = "Sample", values_to = "Abundance") %>%
mutate(Abundance_log = log2(na_if(Abundance, 0))) %>%
left_join(sample_list[, c("Sample", "Group")], by = "Sample")
violin_qc_plot <- ggplot(qc_abundance_long, aes(x = Sample, y = Abundance_log, fill = Group)) +
geom_violin(trim = FALSE, alpha = 0.7) +
geom_boxplot(width = 0.1, fill = "white") +
geom_hline(yintercept = overall_median, color = "red", linetype = "dashed", size = 1) +
facet_wrap(~ Group, scales = "free_x") +
labs(title = "Protein Intensity Distribution per Sample",
x = "Sample", y = "Log2 Intensity") +
scale_fill_manual(values = group_colors) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8))
violin_qc_plot
# Save both PNG and SVG
ggsave(
filename = "violinplot_protein_abundance.png",
plot = violin_qc_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "violinplot_protein_abundance.svg",
plot = violin_qc_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Step 1: Prepare raw protein data
protein_df <- proteinGroups_filtered %>%
dplyr::select("Accession", all_of(sample_list$Sample))
rownames(protein_df) <- protein_df$Accession
protein_df <- protein_df[, -1]
# Step 2: Filtering (on raw intensities)
df_filtered <- filter_valids_gpt(
protein_df,
sample_list,
min_count = 2,
at_least_one = TRUE
)
# --- Create log-transformed version for plotting only ---
df_raw_log <- log2(as.matrix(df_filtered) + 1)
df_raw_log <- as.data.frame(df_raw_log)
df_raw_log$Type <- "Before Normalization"
# --- Keep raw for normalization ---
df_unlogged <- df_filtered
df_unlogged[df_unlogged == 0] <- NA # replace 0s with NA before normalization
normalized_list <- list(
"None" = normalize_df(df_unlogged, 0),
"Median" = normalize_df(df_unlogged, 1),
"Sum" = normalize_df(df_unlogged, 2),
"Batch" = normalize_df(df_unlogged, 3),
"Day" = normalize_df(df_unlogged, 4),
"Quantile Batch" = normalize_df(df_unlogged, 5),
"Quantile" = normalize_df(df_unlogged, 6)
)
normalized_list <- lapply(normalized_list, function(df) {
df[] <- lapply(df, function(col) {
if (is.numeric(col)) log2(col + 1e-6) else col
})
as.data.frame(df)
})
# Choose normalization to proceed
df_normalized <- as.data.frame(normalized_list[["Median"]]) # <-- change here if needed
df_normalized$Type <- "After Normalization"
# Step 5: Combine both for plotting
df_combined <- bind_rows(
df_raw_log %>%
rownames_to_column("Protein") %>%
dplyr::select(-Type) %>% # Remove Type before pivoting
pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
mutate(Type = "Before Normalization"),
df_normalized %>%
rownames_to_column("Protein") %>%
dplyr::select(-Type) %>%
pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
mutate(Type = "After Normalization")
)
# Step 6: Merge group info
df_combined <- left_join(df_combined, sample_list[, c("Sample", "Group")], by = "Sample")
df_combined$Type <- factor(df_combined$Type, levels = c("Before Normalization", "After Normalization"))
df_combined$Group <- factor(df_combined$Group, levels = ordered_groups)
# Step 7: Plot the normalization effect
# Compute one median Intensity per “Type” (facet):
median_by_type <- df_combined %>%
group_by(Type) %>%
summarize(facet_med = median(Intensity, na.rm = TRUE))
# Now build the combined violin + boxplot + median‐line plot:
plot_per_sample_violin <- ggplot(df_combined, aes(x = Sample, y = Intensity, fill = Group)) +
# 1) Draw per‐sample violins:
geom_violin(
trim = FALSE,
width = 1.0,
alpha = 0.6,
color = NA
) +
# 2) Draw a narrow boxplot inside each violin:
geom_boxplot(
width = 0.15,
fill = "white",
color = "black",
outlier.shape = NA,
lwd = 0.3
) +
# 3) Add per‐sample median dot (optional; you can remove if you prefer only the horizontal line):
stat_summary(
aes(group = Sample),
fun = median,
geom = "point",
shape = 21,
fill = "white",
color = "black",
size = 1.5
) +
# 4) Draw a single dashed horizontal line at each facet’s median:
geom_hline(
data = median_by_type,
aes(yintercept = facet_med),
color = "red",
linetype = "dashed",
size = 0.6
) +
# 5) Facet into one column: “Before” on top, “After” below
facet_wrap(~Type, ncol = 1, strip.position = "top") +
# 6) Color each violin by its Group
scale_fill_manual(values = group_colors) +
# 7) Labels
labs(
title = "Effect of Normalization on Protein Intensity",
x = "Sample",
y = expression(Log[2] ~ "Intensity")
) +
# 8) Theme tweaks for a cleaner look
theme_classic(base_size = 12) +
theme(
# Move the main title closer to the facets:
plot.title = element_text(face = "bold", hjust = 0.5, size = 14, margin = margin(b = 8)),
# Style the facet strip (white background, black border):
strip.background = element_rect(fill = "white", color = "black", size = 0.4),
strip.text = element_text(face = "bold", size = 11),
# Rotate sample names 90° under each violin, shrink font:
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4, size = 9),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 11, margin = margin(t = 6)),
axis.title.y = element_text(size = 11, margin = margin(r = 6)),
# Legend on the right (Groups colored consistently)
legend.position = "right",
legend.title = element_text(size = 11),
legend.text = element_text(size = 9),
# Reduce vertical spacing between facets so they sit closer:
panel.spacing.y = unit(0.5, "lines")
)
# 9) Print the plot to the Rmd
print(plot_per_sample_violin)
# Step 8: Imputation
imputation_mode <- 2 # 1 = MNAR only; 2 = MNAR + missForest
if (imputation_mode == 1) {
# Single-step: group-wise MNAR only
suppressMessages({
df_norm_valid_imp <- group_MNAR_impute_data(df_normalized, frequency_threshold = 2)
})
} else if (imputation_mode == 2) {
# Two-step: MNAR group-wise, then missForest
suppressMessages({
step1 <- group_MNAR_impute_data(df_normalized, frequency_threshold = 2) %>%
dplyr::select(!contains("imputed")) %>%
dplyr::select(where(is.numeric)) # removes Type safely
})
# Run missForest quietly
suppressMessages({
missforest_result <- missForest(as.matrix(step1), verbose = FALSE)
})
# Add imputation mask for later QC
missing_mask <- is.na(as.matrix(step1))
colnames(missing_mask) <- paste0("imputed_", colnames(step1))
df_norm_valid_imp <- as.data.frame(missforest_result$ximp)
df_norm_valid_imp <- cbind(df_norm_valid_imp, missing_mask)
}
# Save both PNG and SVG
ggsave(
filename = "Effect of Normalization on Protein Intensity.png",
plot = plot_per_sample_violin,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Effect of Normalization on Protein Intensity.svg",
plot = plot_per_sample_violin,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Prepare long-format data using the helper function (already in your functions_ver2.5.R)
df_long_imputed <- transform_imputed_to_long(df_norm_valid_imp)
# Relabel imputation source for clarity
df_long_imputed <- df_long_imputed %>%
mutate(
Type = ifelse(Source, "Imputed", "Original"),
Type = factor(Type, levels = c("Original", "Imputed")),
Group = sample_list$Group[match(Sample, sample_list$Sample)],
Group = factor(Group, levels = ordered_groups)
)
# Imputation effect plot
medians <- df_long_imputed %>%
filter(Type == "Imputed") %>%
group_by(Sample) %>%
summarise(median_intensity = median(Intensity, na.rm = TRUE))
# Faceted histogram to show distributions
hist_dist_missing_val <- ggplot(df_long_imputed, aes(x = Intensity)) +
geom_histogram(aes(fill = Type), binwidth = 1, position = "identity", alpha = 0.8) +
scale_fill_manual(
name = "Data Source",
values = c("Original" = "cyan3", "Imputed" = "firebrick1"),
labels = c("Original", "Imputed")
) +
facet_wrap(~Sample, scales = "free_y") +
labs(
title = "Distribution of Imputed vs Original Values",
x = "Intensity",
y = "Frequency"
) +
theme_classic(base_size = 12) +
theme(
strip.text = element_text(face = "bold"),
legend.position = "right",
legend.title = element_text(size = 11),
legend.text = element_text(size = 10)
)
# Print the histogram
print(hist_dist_missing_val)
# Save both PNG and SVG
ggsave(
filename = "Distribution of Intensities: Original vs Imputed.png",
plot = hist_dist_missing_val,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Distribution of Intensities: Original vs Imputed.svg",
plot = hist_dist_missing_val,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Join sample_list group info and clean Group
# Check critical objects
stopifnot(
exists("df_norm_valid_imp"),
exists("sample_list"),
all(sample_list$Sample %in% colnames(df_norm_valid_imp))
)
group_info <- sample_list$Group
names(group_info) <- sample_list$Sample
group_info_df <- data.frame(
Sample = names(group_info),
Group = as.character(group_info)
)
df_norm_valid_imp_long <- df_norm_valid_imp %>%
rownames_to_column("Protein") %>%
pivot_longer(
cols = -Protein,
names_to = "Sample",
values_to = "Intensity"
)
# Now join safely
qc_cv <- inner_join(df_norm_valid_imp_long, group_info_df, by = "Sample")
# Unlog intensity
qc_cv$IntensityUnlog <- 2^qc_cv$Intensity
# Calculate CV per protein per group
qc_cv_df <- qc_cv %>%
group_by(Protein, Group) %>%
summarise(cv = cv(IntensityUnlog, na.rm = TRUE), .groups = "drop") %>%
na.omit() %>%
mutate(Group = factor(Group, levels = ordered_groups))
# Enhanced plot
plot_violin_cv <- ggplot(qc_cv_df, aes(x = Group, y = cv, fill = Group)) +
geom_violin(trim = FALSE, alpha = 0.7, width = 0.8, color = NA) +
geom_boxplot(width = 0.15, fill = "white", color = "black", outlier.shape = NA) +
scale_fill_manual(values = group_colors) +
scale_y_continuous(expand = c(0, 0))+
geom_hline(yintercept = 20, linetype = "dashed", color = "red", linewidth = 1) +
labs(
x = "Group",
y = "Coefficient of Variation (%)",
title = "Technical Reproducibility Across Experimental Groups",
caption = paste("n =", format(nrow(qc_cv_df)/3), "proteins per group | CV = SD/mean * 100")
) +
theme_classic(base_size = 14) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1.1, size = 10),
axis.title.y = element_text(size = 12),
panel.grid.major.y = element_line(color = "gray90")
)
print(plot_violin_cv)
# Save both PNG and SVG
ggsave(
filename = "CV Violin Plot-Technical Reproducibility Across Experimental Groups.png",
plot = plot_violin_cv,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "CV Violin Plot-Technical Reproducibility Across Experimental Groups.svg",
plot = plot_violin_cv,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
cv_barplot <- qc_cv_df %>%
mutate(CV_Category = case_when(
cv < 10 ~ "< 10%",
cv >= 10 & cv < 20 ~ "10–20%",
cv >= 20 & cv < 30 ~ "20–30%",
cv >= 30 & cv <= 100 ~ "30–100%",
TRUE ~ "> 100%" # optional fallback
))
# Reorder levels
cv_barplot$CV_Category <- factor(cv_barplot$CV_Category,
levels = c("< 10%", "10–20%", "20–30%", "30–100%", "> 100%"))
# Summarize by group and CV bin
cv_barplot_summary <- cv_barplot %>%
filter(CV_Category != "> 100%") %>%
group_by(Group, CV_Category) %>%
summarise(Count = n(), .groups = "drop") %>%
group_by(Group) %>%
mutate(Percentage = Count / sum(Count) * 100)
# Define new elegant color palette
mycolors <- c(
"< 10%" = "#A6BDDB",
"10–20%" = "#74A9CF",
"20–30%" = "#2B8CBE",
"30–100%" = "#045A8D")
# Plot: Protein CV Category Distribution
CV_barplot <- ggplot(cv_barplot_summary, aes(x = Group, y = Percentage, fill = CV_Category)) +
geom_bar(stat = "identity", width = 0.85) +
scale_fill_manual(values = mycolors) +
labs(
y = "Percentage of Proteins",
fill = "CV Category",
title = "Protein CV Category Distribution"
) +
geom_text(
aes(label = sprintf("%.1f%%", Percentage)),
position = position_stack(vjust = 0.5),
size = 4.5, color = "white"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
axis.text.x = element_text(angle = 90, hjust = 1, size = 10),
axis.text.y = element_text(size = 10),
axis.title.y = element_text(size = 12),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
legend.position = "right"
)
CV_barplot
# Save both PNG and SVG
ggsave(
filename = "Barplot Protein CV Category Distribution.png",
plot = CV_barplot,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "Barplot Protein CV Category Distribution.svg",
plot = CV_barplot,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
### QC: CV vs. Mean Intensity (2D Density Plot by Group)
# Remove rows that are entirely NA
df_norm_valid_imp <- df_norm_valid_imp[rowSums(is.na(df_norm_valid_imp)) < ncol(df_norm_valid_imp), ]
# Prepare CV stats — CV calculated on raw (unlogged) intensity
qc_stats <- df_norm_valid_imp %>%
rownames_to_column("Protein") %>%
pivot_longer(-Protein, names_to = "Sample", values_to = "Intensity") %>%
mutate(Intensity = 2^Intensity) %>% # ✅ reverse log2
left_join(sample_list, by = "Sample") %>%
filter(!is.na(Group)) %>%
group_by(Protein, Group) %>%
summarise(
mean_intensity = mean(Intensity, na.rm = TRUE),
sd_intensity = sd(Intensity, na.rm = TRUE),
cv = 100 * (sd_intensity / mean_intensity),
.groups = "drop"
) %>%
filter(is.finite(cv), is.finite(mean_intensity))
# Create hexbin plot with facets
hex_plot <- ggplot(qc_stats, aes(x = cv, y = log2(mean_intensity))) +
geom_hex(bins = 60) +
facet_wrap(~Group) +
scale_fill_viridis_c(option = "D") +
labs(
title = "2D Density Plot: CV vs Mean Intensity (by Group)",
x = "Coefficient of Variation (CV, %)",
y = "Mean Intensity (log2)"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
legend.position = "right"
)
# Display the plot
print(hex_plot)
# Save both PNG and SVG
ggsave(
filename = "2D Density Plot: CV vs Mean Intensity.png",
plot = hex_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "2D Density Plot: CV vs Mean Intensity.svg",
plot = hex_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
# Select only 2 groups of interest
selected_groups <- c("Prototype", "MPSP")
# Debug: rebuild group_info
group_info <- sample_list$Group
names(group_info) <- sample_list$Sample
# Filter for selected groups
group_filter <- group_info %in% selected_groups
# Subset the data
df_stat <- df_norm_valid_imp[, names(group_info)[group_filter]] %>%
dplyr::select(!contains("imputed"))
group_info <- factor(group_info[group_filter], levels = selected_groups)
# T-test function
t.test.2 <- function(x, y) {
tryCatch({
if (length(unique(y)) != 2 || any(table(y) < 2)) {
return(NA)
}
t.test(x ~ y)$p.value
}, error = function(e) {
return(NA)
})
}
# Calculate p-values
p.values <- apply(df_stat , 1, t.test.2, group_info)
p.values.adj <- p.adjust(p.values, "BH")
proteins_stat <- cbind(df_stat, p.values, p.values.adj) %>% data.frame() %>%
rownames_to_column(var = "Accession")
if(!exists("df_description")) {
gene_name <- proteinGroups_filtered$Description
gene_name <- str_split(gene_name, pattern = "GN=")
collected_gene_names <- sapply(gene_name, function(x) str_split(x[2], pattern = " ")[[1]][1])
gene_name <- proteinGroups_filtered$Description
gene_name <- str_split(gene_name, pattern = "OS=")
collected_description <- sapply(gene_name, function(x) x[1])
df_description <- data.frame(
"Accession" = proteinGroups_filtered$Accession,
"Gene" = collected_gene_names,
"ShortDesc" = collected_description
)
}
The following steps were applied to the dataset before differential analysis:
Sample Selection:
Focused on two groups only: Prototype and
MPSP. Other groups, if present, were excluded.
Protein Filtering Criteria:
min_count = 2).Peptide Quality Control:
Peptides were retained with Sequest HT XCorr ≥
1.5.
Normalization Strategy:
Applied median normalization (log2-transformed after
normalization).
Other normalization strategies (quantile, sum, batch, etc.) are
available but median is used by default.
Imputation Strategy:
A 2-step imputation was applied to account for missing
values:
Intensity Visualization:
Violin plots and heatmaps of protein intensities were generated
before and after normalization, along with QC for
missingness and CV.
Statistical Testing:
Performed two-sample t-tests between the two
groups.
Adjusted p-values were computed using Benjamini-Hochberg
(FDR) correction.
Ratio Calculation:
Log2-transformed fold changes were computed as:
log2(mean Intensity Prototype / mean Intensity MPSP)
Differential Significance Thresholds:
Proteins are labeled “Significant” if:
|log2FC| > 1, andadjusted p-value ≤ 0.05library(ggplot2)
library(plotly)
library(ggrepel)
# Set default strict criteria
df_volcano <- calculate_ratios_volcano_p_adjust(df_norm_valid_imp)
df_volcano$Significance <- ifelse(df_volcano$p.values.adj <= 0.05 & abs(df_volcano$RatioLog) > 1,
"Significant", "Not significant")
# Define color palette
volcano_colors <- c("Significant" = "#D73027", "Not significant" = "darkgray")
# Volcano plot
volcano_plot <- ggplot(df_volcano, aes(x = RatioLog, y = -log10(p.values.adj),
color = Significance,
text = paste0("Gene: ", Gene, "<br>Log2FC: ", round(RatioLog, 2),
"<br>adj.p: ", signif(p.values.adj, 3)))) +
geom_point(alpha = 0.7, size = 0.8) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
scale_color_manual(values = volcano_colors) +
labs(
title = paste0("Volcano Plot: ", GroupA, " vs ", GroupB),
x = paste0("Log2(", ratio_name, ")"),
y = "-log10(Adjusted p-value)") +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
plot.caption = element_text(size = 10, face = "italic"),
legend.position = "right"
)
# Convert to interactive
volcano_interactive <- plotly::ggplotly(volcano_plot, tooltip = "text")
volcano_interactive
# Save both PNG and SVG
ggsave(
filename = "volcano_plot.png",
plot = volcano_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
dpi = 300
)
ggsave(
filename = "volcano_plot.svg",
plot = volcano_plot,
path = export_dir,
width = 6,
height = 5,
units = "in",
device = "svg"
)
openxlsx::write.xlsx(x = df_volcano, file = "proteins_df_stat.xlsx")
save.image(file = paste0(Sys.Date(), "_NewStandardReport.RData"))
This report documents the complete data analysis pipeline for the two conditions mass spectrometry dataset, from data loading and filtering to normalization, imputation, statistical testing, and visualization.
Report generated on r Sys.Date() using RMarkdown.